In [1]:
import preprocess
import visualize
import selection
import relevance
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn import tree
from genetic_selection import GeneticSelectionCV
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import lime
import lime.lime_tabular
import shap
from lime import submodular_pick
from eli5 import show_weights, show_prediction, explain_weights_dfs, explain_prediction_dfs
import warnings
warnings.filterwarnings("ignore")
In [2]:
# load data for personal internet use disorder
data = preprocess.load_piu_data()
data.head()
Out[2]:
| Gender | Achievement | Economic status | Internet Use (in years) | Internet Use (hours per week) | Internet Use (hours per day) | Internet Use (in holiday) | Attitude about time on the Internet | Politics | Business | ... | Coffee | Alcohol | Drepressive temperament | Cyclothymic temperament | Hyperthymic temperament | Irritable temperament | Anxiety temperament | PIU | PIUcutoff | Cutoff_Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 3.0 | 3.0 | 9.0 | 1.0 | 1.0 | 2.0 | 0.0 | 5.0 | 1.0 | ... | 1.0 | 1.0 | 0.857143 | 0.571429 | 0.500 | 0.571429 | 0.714286 | NaN | NaN | NaN |
| 1 | 0.0 | 4.0 | 4.0 | 5.0 | 0.0 | 0.0 | 2.0 | 0.0 | 2.0 | 1.0 | ... | 0.0 | 0.0 | 0.285714 | 0.857143 | 0.875 | 0.571429 | 0.571429 | 25.0 | 0.0 | 0.0 |
| 2 | 0.0 | 4.0 | 3.0 | 9.0 | 1.0 | 1.0 | 1.0 | 0.0 | 2.0 | 3.0 | ... | 0.0 | 0.0 | 0.142857 | 0.142857 | 1.000 | 0.142857 | 0.000000 | 28.0 | 0.0 | 0.0 |
| 3 | 0.0 | 4.0 | 3.0 | 7.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 2.0 | ... | 0.0 | 0.0 | 0.285714 | 0.142857 | 1.000 | 0.714286 | 0.428571 | 31.0 | 0.0 | 0.0 |
| 4 | 0.0 | 4.0 | 1.0 | 2.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 3.0 | ... | NaN | 0.0 | 0.428571 | 0.428571 | 0.625 | 0.857143 | 0.714286 | 28.0 | 0.0 | 0.0 |
5 rows × 59 columns
In [3]:
# print column names to check if everything is OK
print(data.columns)
Index(['Gender', 'Achievement', 'Economic status', 'Internet Use (in years)',
'Internet Use (hours per week)', 'Internet Use (hours per day)',
'Internet Use (in holiday)', 'Attitude about time on the Internet',
'Politics', 'Business', 'Sports', 'Computers and technology',
'Arts and culture', 'Education', 'Pop culture', 'Pornography', 'Music',
'Travel/tourism', 'Health and medicine', 'Science', 'Religion',
'Communication by e-mail', 'Social networks',
'Communication on the forum', 'Communication on the blog',
'Targeted Internet search', 'Surfing', 'Expert Advice',
'Search for favorite websites', 'Reading the news', 'Online games',
'Reading and downloading books and texts',
'Downloading music and movies', 'Internet for school', 'Online courses',
'Everyday FB use', 'Average time spent on FB', 'FB use - reading posts',
'FB use - publishing statuses', 'FB use - sharing music, photos etc.',
'FB use –gaming', 'FB use – chatting', 'FB use – visiting groups',
'Sports – days in a week', 'Sports – intensity', 'Sports – in minutes',
'Energy drinks', 'Fast Food', 'Smoker', 'Coffee', 'Alcohol',
'Drepressive temperament', 'Cyclothymic temperament',
'Hyperthymic temperament', 'Irritable temperament',
'Anxiety temperament', 'PIU', 'PIUcutoff', 'Cutoff_Class'],
dtype='object')
In [4]:
# check columns with NaN values
print(data.isna().sum().sort_values(ascending=False))
Cutoff_Class 104 PIUcutoff 104 PIU 104 Average time spent on FB 95 Internet Use (in years) 91 Sports – in minutes 86 Internet Use (hours per day) 72 Communication by e-mail 60 Communication on the blog 59 Sports – intensity 55 Online games 54 Online courses 54 Pornography 53 Communication on the forum 52 Expert Advice 49 Reading and downloading books and texts 48 Fast Food 46 Religion 45 Travel/tourism 43 Pop culture 43 Business 43 Computers and technology 42 Energy drinks 41 Health and medicine 40 Education 40 Arts and culture 40 Downloading music and movies 39 Science 39 Search for favorite websites 39 Internet for school 37 Targeted Internet search 35 Sports – days in a week 34 Reading the news 34 Internet Use (hours per week) 34 Coffee 33 Politics 33 Music 32 Sports 30 Everyday FB use 29 Attitude about time on the Internet 27 Surfing 24 Economic status 23 Internet Use (in holiday) 23 Smoker 22 Alcohol 21 Social networks 20 Achievement 10 Gender 2 FB use – visiting groups 0 FB use – chatting 0 FB use –gaming 0 FB use - sharing music, photos etc. 0 Drepressive temperament 0 Cyclothymic temperament 0 Hyperthymic temperament 0 Irritable temperament 0 Anxiety temperament 0 FB use - publishing statuses 0 FB use - reading posts 0 dtype: int64
In [5]:
data = preprocess.process_standardization(data)
data.head
Out[5]:
<bound method NDFrame.head of Gender Achievement Economic status Internet Use (in years) \
0 0.0 -1.366523 -0.389762 1.528074
1 0.0 -0.145708 0.564106 -0.237632
2 0.0 -0.145708 -0.389762 1.528074
3 0.0 -0.145708 -0.389762 0.645221
4 0.0 -0.145708 -2.297499 -1.561912
... ... ... ... ...
2108 0.0 -0.145708 -2.297499 -1.561912
2109 0.0 1.075106 -0.389762 -0.237632
2110 0.0 -1.366523 -0.389762 -0.237632
2111 0.0 -0.145708 -2.297499 -1.120486
2112 0.0 -1.366523 0.564106 -1.120486
Internet Use (hours per week) Internet Use (hours per day) \
0 1.0 0.646391
1 0.0 -0.703951
2 1.0 0.646391
3 0.0 -0.703951
4 0.0 -0.703951
... ... ...
2108 1.0 0.646391
2109 1.0 -0.703951
2110 1.0 -0.703951
2111 1.0 -0.703951
2112 1.0 -0.703951
Internet Use (in holiday) Attitude about time on the Internet \
0 1.272123 0.0
1 1.272123 0.0
2 -0.141347 0.0
3 -0.141347 0.0
4 1.272123 0.0
... ... ...
2108 1.272123 0.0
2109 1.272123 0.0
2110 -0.141347 0.0
2111 -0.141347 1.0
2112 1.272123 0.0
Politics Business ... Coffee Alcohol Drepressive temperament \
0 3.554807 -0.685844 ... 1.0 1.0 2.595336
1 0.367281 -0.685844 ... 0.0 0.0 0.217216
2 0.367281 1.335077 ... 0.0 0.0 -0.377314
3 -0.695228 0.324616 ... 0.0 0.0 0.217216
4 -0.695228 1.335077 ... NaN 0.0 0.811746
... ... ... ... ... ... ...
2108 -0.695228 -0.685844 ... 0.0 1.0 2.000806
2109 -0.695228 -0.685844 ... 0.0 0.0 -0.377314
2110 -0.695228 -0.685844 ... 0.0 1.0 -0.971844
2111 0.367281 3.355997 ... 0.0 1.0 1.406276
2112 0.367281 -0.685844 ... 0.0 1.0 -0.971844
Cyclothymic temperament Hyperthymic temperament Irritable temperament \
0 0.186837 -1.031386 0.273212
1 1.193945 0.567918 0.273212
2 -1.323826 1.101019 -1.171234
3 -1.323826 1.101019 0.754694
4 -0.316717 -0.498284 1.236176
... ... ... ...
2108 -0.316717 1.101019 0.754694
2109 -1.323826 0.567918 -1.171234
2110 -1.827380 1.101019 -0.689752
2111 0.690391 1.101019 1.236176
2112 -0.820272 0.034817 -0.208270
Anxiety temperament PIU PIUcutoff Cutoff_Class
0 1.139434 NaN NaN NaN
1 0.641477 25.0 0.0 0.0
2 -1.350353 28.0 0.0 0.0
3 0.143519 31.0 0.0 0.0
4 1.139434 28.0 0.0 0.0
... ... ... ... ...
2108 -0.354438 46.0 1.0 1.0
2109 -0.852396 30.0 0.0 0.0
2110 -1.350353 39.0 1.0 0.0
2111 0.143519 39.0 1.0 0.0
2112 -1.350353 18.0 0.0 0.0
[2113 rows x 59 columns]>
In [6]:
# process columns with NaN values
data = preprocess.process_columns_with_nan_values(data)
# check again columns with NaN values
print(data.isna().sum().sort_values(ascending=False))
Gender 0 Sports – intensity 0 Downloading music and movies 0 Internet for school 0 Online courses 0 Everyday FB use 0 Average time spent on FB 0 FB use - reading posts 0 FB use - publishing statuses 0 FB use - sharing music, photos etc. 0 FB use –gaming 0 FB use – chatting 0 FB use – visiting groups 0 Sports – days in a week 0 Sports – in minutes 0 Online games 0 Energy drinks 0 Fast Food 0 Smoker 0 Coffee 0 Alcohol 0 Drepressive temperament 0 Cyclothymic temperament 0 Hyperthymic temperament 0 Irritable temperament 0 Anxiety temperament 0 PIU 0 PIUcutoff 0 Reading and downloading books and texts 0 Reading the news 0 Achievement 0 Pop culture 0 Economic status 0 Internet Use (in years) 0 Internet Use (hours per week) 0 Internet Use (hours per day) 0 Internet Use (in holiday) 0 Attitude about time on the Internet 0 Politics 0 Business 0 Sports 0 Computers and technology 0 Arts and culture 0 Education 0 Pornography 0 Search for favorite websites 0 Music 0 Travel/tourism 0 Health and medicine 0 Science 0 Religion 0 Communication by e-mail 0 Social networks 0 Communication on the forum 0 Communication on the blog 0 Targeted Internet search 0 Surfing 0 Expert Advice 0 Cutoff_Class 0 dtype: int64
In [7]:
data = preprocess.process_outliers(data)
Original number of rows: 2009 Number of normal rows detected: 1908 Number of outliers detected: 101 Number of rows after eliminating outliers: 1908
In [8]:
# predictors and class
x_data = data[data.columns.difference(['Cutoff_Class', 'PIUcutoff', 'PIU' ])]
y_data = data['Cutoff_Class']
# split dataset to training and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.05, random_state=1234, shuffle=True) #, stratify=y_data)
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
In [9]:
# prepare data frame to count top features
df_top_features = pd.DataFrame({'attr_names' : x_train.columns.values})
df_top_features['top_count'] = 0
df_top_features['top_count_xai'] = 0
df_top_features['top_count_selection'] = 0
In [10]:
df = pd.DataFrame({'Cutoff_Class':y_train})
visualize.show_value_distribution_per_column(df, 'Cutoff_Class')
In [11]:
# oversample data
oversampler = SMOTE()
x_train, y_train = oversampler.fit_resample(x_train, y_train)
df = pd.DataFrame({'Cutoff_Class':y_train})
visualize.show_value_distribution_per_column(df, 'Cutoff_Class')
In [12]:
k_features = selection.select_k_best_features(x_train, y_train, 0)
visualize.show_selected_features(
title = 'Feature selection using univariate statistical test (ANOVA F-value)',
label_x = 'Feature score',
label_y = 'Feature name',
data_x = k_features['values'],
data_y = k_features['attr_names']
)
df_top_features = relevance.add_top_features_count('select_k_best', df_top_features, k_features, 10)
In [13]:
trained_models = dict()
parameters = dict()
# hyperparameters for decision tree classifier
parameters['decision_tree'] = dict(
max_leaf_nodes = list(range(2,10)),
min_samples_split = [2,3,4],
max_depth = [2,3,5,10],
criterion = ['gini', 'entropy']
)
trained_models['decision_tree'] = DecisionTreeClassifier()
# hyperparameters for random forest classifier
parameters['random_forest'] = dict(
n_estimators = [10,20,40],
criterion = ['gini', 'entropy', 'log_loss'],
# max_features = list(range(3,8)),
max_depth = [2,4,9]
)
trained_models['random_forest'] = RandomForestClassifier()
# hyperparameters for adaboost classifier
parameters['adaboost'] = dict(
n_estimators = [10,20,40],
learning_rate = [0.01,0.1,1],
algorithm = ['SAMME', 'SAMME.R']
)
trained_models['adaboost'] = AdaBoostClassifier()
# hyperparameters for xgboost classifier
parameters['xgboost'] = dict(
max_depth = [1,3,5,7,9,11],
learning_rate = [0.01,0.1,1,10,100],
subsample = [0.5, 0.7, 1],
n_estimators = [5,50,70]
)
trained_models['xgboost'] = XGBClassifier()
#hyperparameters for bagging classifier
parameters['bagging'] = dict(
#n_estimators = [300, 400, 500, 600, 700, 800],
n_estimators = [300, 600],
# max_features = [0.90, 0.92, 0.95, 1.0],
bootstrap = [True, False],
bootstrap_features = [True, False],
)
trained_models['bagging'] = BaggingClassifier()
# hyperparameters for gradient boosting classifier
parameters['gradient_boosting'] = dict(
n_estimators = [5,50,70],
max_depth = [1,3,5,7,9,11],
learning_rate = [0.01,0.1,1,10,100],
#loss = ['log_loss', 'exponential'],
#criterion = ['friedman_mse', 'squared_error']
)
trained_models['gradient_boosting'] = GradientBoostingClassifier()
# train classifier using GridSearchCV for each of the selected models
# print("Not optimizing parameters every time")
# uncomment only to recheck hyperparameter optimization
# for m in trained_models:
# trained_models[m] = selection.train_best_classifier(m, trained_models[m], parameters[m], x_train, y_train, x_test, y_test)
# instead create models with optimized parameters
trained_models = dict()
trained_models['decision_tree'] = DecisionTreeClassifier(criterion='entropy', max_depth=5, max_leaf_nodes=7)
trained_models['random_forest'] = RandomForestClassifier(criterion='entropy', max_depth=9, n_estimators=40)
trained_models['adaboost'] = AdaBoostClassifier(learning_rate=1, n_estimators=40)
trained_models['xgboost'] = XGBClassifier(max_depth=9, learning_rate=0.1, n_estimators=50)
trained_models['bagging'] = BaggingClassifier(bootstrap_features=False, bootstrap=True, n_estimators=600)
trained_models['gradient_boost'] = GradientBoostingClassifier(learning_rate=1, max_depth=11, n_estimators=70)
In [14]:
# compute classification metrics for every classification model
df_models = selection.calculate_classifier_metrics(trained_models, x_train, y_train)
print(df_models)
Model name Accuracy Precision Recall F1 0 decision_tree 0.722 0.782 0.620 0.690 1 random_forest 0.782 0.800 0.752 0.774 2 adaboost 0.771 0.767 0.779 0.773 3 xgboost 0.794 0.802 0.781 0.790 4 bagging 0.808 0.826 0.781 0.802 5 gradient_boost 0.800 0.814 0.781 0.796
In [15]:
# show metrics for trained models
visualize.show_model_comparasion(df_models)
In [16]:
# fit all models with train data
trained_models = dict()
trained_models['decision_tree'] = DecisionTreeClassifier(criterion='entropy', max_depth=5, max_leaf_nodes=7)
trained_models['random_forest'] = RandomForestClassifier(criterion='entropy', max_depth=9, n_estimators=40)
trained_models['adaboost'] = AdaBoostClassifier(learning_rate=1, n_estimators=40)
trained_models['xgboost'] = XGBClassifier(max_depth=9, learning_rate=0.1, n_estimators=50)
for m in trained_models:
trained_models[m].fit(x_train, y_train)
df_features = selection.get_feature_importances(trained_models, x_train, y_train)
visualize.show_selected_features(
title = 'Feature selection - Tree models',
label_x = 'Feature score',
label_y = 'Feature name',
data_x = df_features['values'],
data_y = df_features['attr_names']
)
df_top_features = relevance.add_top_features_count('tree_importance', df_top_features, df_features, 10)
In [17]:
# show features selected by LassoCV
df_features = selection.get_feature_scores_lasso(x_train, y_train)
visualize.show_selected_features(
title = 'Feature selection - LassoCV',
label_x = 'Feature score',
label_y = 'Feature name',
data_x = df_features['values'],
data_y = df_features['attr_names']
)
df_top_features = relevance.add_top_features_count('lasso', df_top_features, df_features, 10)
In [18]:
# show features selected by ElasticNET
df_features = selection.get_feature_scores_elenet(x_train, y_train)
visualize.show_selected_features(
title = 'Feature selection - ElasticNET',
label_x = 'Feature score',
label_y = 'Feature name',
data_x = df_features['values'],
data_y = df_features['attr_names']
)
df_top_features = relevance.add_top_features_count('elenet', df_top_features, df_features, 10)
In [19]:
# fit one representative model
predictions = relevance.get_predictions('random_forest', x_train, y_train, x_test, y_test)
Train accuracy: 0.97
Test accuracy: 0.74
Confusion Matrix:
[[45 10]
[15 26]]
Classification Report:
precision recall f1-score support
0.0 0.75 0.82 0.78 55
1.0 0.72 0.63 0.68 41
accuracy 0.74 96
macro avg 0.74 0.73 0.73 96
weighted avg 0.74 0.74 0.74 96
In [20]:
# feature selection using genetic algorithms
selector = GeneticSelectionCV(
estimator=predictions['model'],
cv=10,
scoring="accuracy",
n_population=20, #50,
n_generations=5, #40,
n_jobs=10,
verbose=False
)
selector.fit(x_train, y_train)
visualize.show_genetic_selection_results(selector, x_train)
features is_used 0 Achievement True 1 Alcohol True 2 Anxiety temperament True 3 Arts and culture False 4 Attitude about time on the Internet True 5 Average time spent on FB True 6 Business True 7 Coffee True 8 Communication by e-mail True 9 Communication on the blog True 10 Communication on the forum True 11 Computers and technology False 12 Cyclothymic temperament True 13 Downloading music and movies True 14 Drepressive temperament True 15 Economic status True 16 Education True 17 Energy drinks True 18 Everyday FB use True 19 Expert Advice True 20 FB use - publishing statuses True 21 FB use - reading posts True 22 FB use - sharing music, photos etc. True 23 FB use – chatting True 24 FB use – visiting groups True 25 FB use –gaming True 26 Fast Food True 27 Gender True 28 Health and medicine True 29 Hyperthymic temperament True 30 Internet Use (hours per day) False 31 Internet Use (hours per week) True 32 Internet Use (in holiday) True 33 Internet Use (in years) True 34 Internet for school False 35 Irritable temperament True 36 Music True 37 Online courses True 38 Online games True 39 Politics True 40 Pop culture True 41 Pornography False 42 Reading and downloading books and texts True 43 Reading the news True 44 Religion True 45 Science True 46 Search for favorite websites True 47 Smoker False 48 Social networks True 49 Sports False 50 Sports – days in a week True 51 Sports – in minutes True 52 Sports – intensity True 53 Surfing True 54 Targeted Internet search True 55 Travel/tourism False
In [21]:
# add top features statistics
df_features = pd.DataFrame({'attr_names': x_train.columns.values, 'values':selector.support_})
df_top_features = relevance.add_top_features_count('genetic_selection', df_top_features, df_features, 10)
In [22]:
# fit one representative model and get it's predictions
predictions = relevance.get_predictions('random_forest', x_train, y_train, x_test, y_test)
Train accuracy: 0.97
Test accuracy: 0.74
Confusion Matrix:
[[46 9]
[16 25]]
Classification Report:
precision recall f1-score support
0.0 0.74 0.84 0.79 55
1.0 0.74 0.61 0.67 41
accuracy 0.74 96
macro avg 0.74 0.72 0.73 96
weighted avg 0.74 0.74 0.74 96
In [23]:
# use LIME to explain results for a single prediction
explainer = lime.lime_tabular.LimeTabularExplainer(
x_train.values,
feature_names=x_test.columns.values.tolist(),
class_names=['PIU no', 'PIU yes'],
verbose=False,
mode='classification'
)
predict_fn = lambda x: predictions['model'].predict_proba(x)
print('True positive predictions.\n')
i = predictions['tp'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
exp = explainer.explain_instance(
x_test.loc[i, x_test.columns.values.tolist()].astype(int).values,
predict_fn,
num_features=10
)
map = exp.as_map()
print(map)
exp.show_in_notebook(show_table=True)
figure = exp.as_pyplot_figure(label = exp.available_labels()[0])
print('True negative predictions.\n')
i = predictions['tn'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
exp = explainer.explain_instance(
x_test.loc[i, x_test.columns.values.tolist()].astype(int).values,
predict_fn,
num_features=10
)
exp.show_in_notebook(show_table=True)
figure = exp.as_pyplot_figure(label = exp.available_labels()[0])
print('False positive predictions.\n')
i = predictions['fp'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
exp = explainer.explain_instance(
x_test.loc[i, x_test.columns.values.tolist()].astype(int).values,
predict_fn,
num_features=10
)
exp.show_in_notebook(show_table=True)
figure = exp.as_pyplot_figure(label = exp.available_labels()[0])
print('False negative predictions.\n')
i = predictions['fn'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
exp = explainer.explain_instance(
x_test.loc[i, x_test.columns.values.tolist()].astype(int).values,
predict_fn,
num_features=10
)
exp.show_in_notebook(show_table=True)
figure = exp.as_pyplot_figure(label = exp.available_labels()[0])
True positive predictions.
Expected PIU value: 1.0.
Predicted PIU value: 1.0.
{1: [(31, 0.11128951703580371), (4, -0.07341759951466524), (3, -0.05341176289759748), (5, 0.05169847495873944), (48, 0.046315630954042925), (12, 0.03946902747313003), (17, 0.03157528748484642), (46, -0.027174525064780956), (21, -0.024469859196085277), (1, -0.02371051659455407)]}
True negative predictions. Expected PIU value: 0.0. Predicted PIU value: 0.0.
False positive predictions. Expected PIU value: 0.0. Predicted PIU value: 1.0.
False negative predictions. Expected PIU value: 1.0. Predicted PIU value: 0.0.
In [24]:
# Let's use SP-LIME to return explanations on a sample data set
# and obtain a non-redundant global decision perspective of the black-box model
sp_exp = submodular_pick.SubmodularPick(explainer,
x_test[x_test.columns.values.tolist()].values,
predict_fn,
num_features=5,
num_exps_desired=5
)
[exp.show_in_notebook() for exp in sp_exp.sp_explanations]
print('SP-LIME Explanations.')
[exp.as_pyplot_figure(label=exp.available_labels()[0]) for exp in sp_exp.sp_explanations]
print('SP-LIME Local Explanations')
SP-LIME Explanations. SP-LIME Local Explanations
In [25]:
# creating SHAP explainer
explainer = shap.TreeExplainer(predictions['model'])
shap_values = explainer.shap_values(x_train)
# show summary plot of global features relevance
shap.initjs()
shap.summary_plot(shap_values, x_train)
In [26]:
# add top features statistics
df = relevance.get_shap_values_as_data_frame(x_train, shap_values)
print(df)
df_top_features = relevance.add_top_features_count('SHAP', df_top_features, k_features, 10)
df = relevance.get_shap_values_as_data_frame(x_train, shap_values, 0)
print(df)
df = relevance.get_shap_values_as_data_frame(x_train, shap_values, 1)
print(df)
attr_names values
1 Average time spent on FB 0.118057
2 Internet Use (hours per week) 0.113004
3 Cyclothymic temperament 0.102898
4 Arts and culture 0.067168
5 Irritable temperament 0.051360
6 Internet Use (hours per day) 0.049661
7 Attitude about time on the Internet 0.049350
8 Surfing 0.038018
9 Online games 0.037523
10 Social networks 0.035075
11 Achievement 0.030403
12 Search for favorite websites 0.028973
13 Anxiety temperament 0.028000
14 Everyday FB use 0.025982
15 Pornography 0.025213
16 Energy drinks 0.024699
17 Drepressive temperament 0.022546
18 Computers and technology 0.019394
19 Hyperthymic temperament 0.016688
20 Expert Advice 0.016658
21 Internet Use (in holiday) 0.015385
22 Communication on the forum 0.012028
23 Sports – days in a week 0.011033
24 Alcohol 0.010773
25 Internet Use (in years) 0.010741
26 Health and medicine 0.010317
27 Education 0.010009
28 Communication on the blog 0.009742
29 Gender 0.009558
30 Reading the news 0.009292
31 Reading and downloading books and texts 0.009183
32 FB use – chatting 0.008665
33 Sports – intensity 0.008551
34 Online courses 0.008364
35 FB use - reading posts 0.008040
36 Pop culture 0.007248
37 Internet for school 0.006577
38 Sports – in minutes 0.006416
39 Economic status 0.006410
40 Targeted Internet search 0.006394
41 Business 0.006302
42 Downloading music and movies 0.005368
43 Travel/tourism 0.005223
44 Religion 0.005200
45 Sports 0.004869
46 Science 0.004594
47 Communication by e-mail 0.004344
48 Politics 0.004127
49 Coffee 0.004120
50 FB use – visiting groups 0.003894
51 FB use - sharing music, photos etc. 0.003178
52 FB use - publishing statuses 0.002496
53 Fast Food 0.002167
54 Music 0.001923
55 Smoker 0.001714
56 FB use –gaming 0.001160
attr_names values
1 Average time spent on FB 0.059029
2 Internet Use (hours per week) 0.056502
3 Cyclothymic temperament 0.051449
4 Arts and culture 0.033584
5 Irritable temperament 0.025680
6 Internet Use (hours per day) 0.024831
7 Attitude about time on the Internet 0.024675
8 Surfing 0.019009
9 Online games 0.018762
10 Social networks 0.017538
11 Achievement 0.015201
12 Search for favorite websites 0.014486
13 Anxiety temperament 0.014000
14 Everyday FB use 0.012991
15 Pornography 0.012606
16 Energy drinks 0.012349
17 Drepressive temperament 0.011273
18 Computers and technology 0.009697
19 Hyperthymic temperament 0.008344
20 Expert Advice 0.008329
21 Internet Use (in holiday) 0.007692
22 Communication on the forum 0.006014
23 Sports – days in a week 0.005517
24 Alcohol 0.005386
25 Internet Use (in years) 0.005371
26 Health and medicine 0.005158
27 Education 0.005004
28 Communication on the blog 0.004871
29 Gender 0.004779
30 Reading the news 0.004646
31 Reading and downloading books and texts 0.004592
32 FB use – chatting 0.004332
33 Sports – intensity 0.004276
34 Online courses 0.004182
35 FB use - reading posts 0.004020
36 Pop culture 0.003624
37 Internet for school 0.003288
38 Sports – in minutes 0.003208
39 Economic status 0.003205
40 Targeted Internet search 0.003197
41 Business 0.003151
42 Downloading music and movies 0.002684
43 Travel/tourism 0.002612
44 Religion 0.002600
45 Sports 0.002434
46 Science 0.002297
47 Communication by e-mail 0.002172
48 Politics 0.002063
49 Coffee 0.002060
50 FB use – visiting groups 0.001947
51 FB use - sharing music, photos etc. 0.001589
52 FB use - publishing statuses 0.001248
53 Fast Food 0.001083
54 Music 0.000961
55 Smoker 0.000857
56 FB use –gaming 0.000580
attr_names values
1 Average time spent on FB 0.059029
2 Internet Use (hours per week) 0.056502
3 Cyclothymic temperament 0.051449
4 Arts and culture 0.033584
5 Irritable temperament 0.025680
6 Internet Use (hours per day) 0.024831
7 Attitude about time on the Internet 0.024675
8 Surfing 0.019009
9 Online games 0.018762
10 Social networks 0.017538
11 Achievement 0.015201
12 Search for favorite websites 0.014486
13 Anxiety temperament 0.014000
14 Everyday FB use 0.012991
15 Pornography 0.012606
16 Energy drinks 0.012349
17 Drepressive temperament 0.011273
18 Computers and technology 0.009697
19 Hyperthymic temperament 0.008344
20 Expert Advice 0.008329
21 Internet Use (in holiday) 0.007692
22 Communication on the forum 0.006014
23 Sports – days in a week 0.005517
24 Alcohol 0.005386
25 Internet Use (in years) 0.005371
26 Health and medicine 0.005158
27 Education 0.005004
28 Communication on the blog 0.004871
29 Gender 0.004779
30 Reading the news 0.004646
31 Reading and downloading books and texts 0.004592
32 FB use – chatting 0.004332
33 Sports – intensity 0.004276
34 Online courses 0.004182
35 FB use - reading posts 0.004020
36 Pop culture 0.003624
37 Internet for school 0.003288
38 Sports – in minutes 0.003208
39 Economic status 0.003205
40 Targeted Internet search 0.003197
41 Business 0.003151
42 Downloading music and movies 0.002684
43 Travel/tourism 0.002612
44 Religion 0.002600
45 Sports 0.002434
46 Science 0.002297
47 Communication by e-mail 0.002172
48 Politics 0.002063
49 Coffee 0.002060
50 FB use – visiting groups 0.001947
51 FB use - sharing music, photos etc. 0.001589
52 FB use - publishing statuses 0.001248
53 Fast Food 0.001083
54 Music 0.000961
55 Smoker 0.000857
56 FB use –gaming 0.000580
In [27]:
# plot explanations for single instance predictions
print('True positive predictions.\n')
i = predictions['tp'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], x_test.iloc[i,:])
True positive predictions. Expected PIU value: 1.0. Predicted PIU value: 1.0.
Out[27]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [28]:
print('True negative predictions.\n')
i = predictions['tn'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], x_test.iloc[i,:])
True negative predictions. Expected PIU value: 0.0. Predicted PIU value: 0.0.
Out[28]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [29]:
print('False positive predictions.\n')
i = predictions['fp'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], x_test.iloc[i,:])
False positive predictions. Expected PIU value: 0.0. Predicted PIU value: 1.0.
Out[29]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [30]:
print('False negative predictions.\n')
i = predictions['fn'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], x_test.iloc[i,:])
False negative predictions. Expected PIU value: 1.0. Predicted PIU value: 0.0.
Out[30]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [31]:
shap.initjs()
shap.decision_plot(explainer.expected_value[1], shap_values[1][0,:], x_train.iloc[3,:])
In [32]:
# use eli5 (permutation feature importance) to show global feature relevance
show_weights(
predictions['model'],
targets=[0, 1],
target_names=['PIU no', 'PIU yes'],
feature_names=x_train.columns.values.tolist(),
top=len(x_train.columns)+1
)
Out[32]:
| Weight | Feature |
|---|---|
| 0.0774 ± 0.0903 | Average time spent on FB |
| 0.0651 ± 0.0584 | Cyclothymic temperament |
| 0.0578 ± 0.0813 | Internet Use (hours per week) |
| 0.0366 ± 0.0359 | Irritable temperament |
| 0.0344 ± 0.0477 | Arts and culture |
| 0.0281 ± 0.0452 | Online games |
| 0.0280 ± 0.0537 | Attitude about time on the Internet |
| 0.0274 ± 0.0299 | Anxiety temperament |
| 0.0272 ± 0.0305 | Surfing |
| 0.0252 ± 0.0425 | Internet Use (hours per day) |
| 0.0227 ± 0.0476 | Social networks |
| 0.0220 ± 0.0380 | Search for favorite websites |
| 0.0217 ± 0.0351 | Drepressive temperament |
| 0.0205 ± 0.0188 | Sports – days in a week |
| 0.0203 ± 0.0231 | Pornography |
| 0.0203 ± 0.0205 | Internet Use (in years) |
| 0.0198 ± 0.0218 | Hyperthymic temperament |
| 0.0191 ± 0.0184 | Reading the news |
| 0.0188 ± 0.0287 | Achievement |
| 0.0177 ± 0.0197 | Sports – in minutes |
| 0.0175 ± 0.0247 | Computers and technology |
| 0.0170 ± 0.0527 | Energy drinks |
| 0.0163 ± 0.0289 | Everyday FB use |
| 0.0163 ± 0.0197 | Communication on the blog |
| 0.0159 ± 0.0209 | Reading and downloading books and texts |
| 0.0154 ± 0.0229 | Health and medicine |
| 0.0153 ± 0.0155 | Economic status |
| 0.0151 ± 0.0209 | Internet Use (in holiday) |
| 0.0145 ± 0.0168 | Pop culture |
| 0.0135 ± 0.0165 | Internet for school |
| 0.0134 ± 0.0168 | Education |
| 0.0133 ± 0.0197 | Expert Advice |
| 0.0124 ± 0.0163 | Communication by e-mail |
| 0.0123 ± 0.0154 | Religion |
| 0.0122 ± 0.0159 | Science |
| 0.0121 ± 0.0191 | Travel/tourism |
| 0.0120 ± 0.0152 | Downloading music and movies |
| 0.0119 ± 0.0135 | Sports – intensity |
| 0.0118 ± 0.0223 | Communication on the forum |
| 0.0115 ± 0.0173 | Sports |
| 0.0112 ± 0.0187 | Business |
| 0.0107 ± 0.0217 | FB use - reading posts |
| 0.0105 ± 0.0145 | Online courses |
| 0.0099 ± 0.0142 | Targeted Internet search |
| 0.0093 ± 0.0145 | Politics |
| 0.0086 ± 0.0124 | Alcohol |
| 0.0076 ± 0.0216 | FB use – chatting |
| 0.0070 ± 0.0111 | Gender |
| 0.0064 ± 0.0106 | Coffee |
| 0.0062 ± 0.0110 | FB use – visiting groups |
| 0.0048 ± 0.0098 | FB use - sharing music, photos etc. |
| 0.0047 ± 0.0101 | FB use - publishing statuses |
| 0.0040 ± 0.0095 | Music |
| 0.0036 ± 0.0071 | Fast Food |
| 0.0030 ± 0.0064 | Smoker |
| 0.0024 ± 0.0056 | FB use –gaming |
In [33]:
# add top features statistics
df = explain_weights_dfs(
predictions['model'],
targets=[0, 1],
target_names=['PIU no', 'PIU yes'],
feature_names=x_train.columns.values.tolist(),
top=len(x_train.columns)+1
)
df = pd.DataFrame({'attr_names' : df['feature_importances'].iloc[:, 0], 'values' : df['feature_importances'].iloc[:, 1]})
df_top_features = relevance.add_top_features_count('eli5', df_top_features, k_features, 10)
In [34]:
# plot explanations for single instance predictions
print('True positive predictions.\n')
i = predictions['tp'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
show_prediction(
predictions['model'],
x_test.iloc[i,:],
targets=[0, 1],
target_names=['PIU no', 'PIU yes'],
feature_names=x_test.columns.values.tolist(),
top=(10, 10)
)
True positive predictions. Expected PIU value: 1.0. Predicted PIU value: 1.0.
Out[34]:
y=PIU yes (probability 0.735) top features
| Contribution? | Feature |
|---|---|
| +0.502 | <BIAS> |
| +0.081 | Internet Use (hours per week) |
| +0.072 | Average time spent on FB |
| +0.026 | Internet Use (hours per day) |
| +0.022 | Health and medicine |
| +0.018 | Surfing |
| +0.017 | Search for favorite websites |
| +0.016 | Anxiety temperament |
| +0.015 | Pornography |
| +0.015 | Social networks |
| … 23 more positive … | |
| … 9 more negative … | |
| -0.005 | FB use – visiting groups |
| -0.006 | Alcohol |
| -0.007 | Sports |
| -0.008 | Gender |
| -0.014 | Irritable temperament |
| -0.014 | Drepressive temperament |
| -0.016 | Religion |
| -0.016 | Business |
| -0.024 | Attitude about time on the Internet |
| -0.033 | Arts and culture |
In [35]:
print('True negative predictions.\n')
i = predictions['tn'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
show_prediction(
predictions['model'],
x_test.iloc[i,:],
targets=[0, 1],
target_names=['PIU no', 'PIU yes'],
feature_names=x_test.columns.values.tolist(),
top=len(x_test.columns)+1
)
True negative predictions. Expected PIU value: 0.0. Predicted PIU value: 0.0.
Out[35]:
y=PIU no (probability 0.795) top features
| Contribution? | Feature |
|---|---|
| +0.498 | <BIAS> |
| +0.068 | Cyclothymic temperament |
| +0.050 | Average time spent on FB |
| +0.039 | Social networks |
| +0.028 | Internet Use (hours per week) |
| +0.024 | Online games |
| +0.024 | Internet Use (hours per day) |
| +0.020 | Arts and culture |
| +0.020 | Achievement |
| +0.018 | Energy drinks |
| +0.016 | Gender |
| +0.016 | Drepressive temperament |
| +0.013 | Internet Use (in years) |
| +0.011 | Computers and technology |
| +0.010 | Search for favorite websites |
| +0.009 | Pornography |
| +0.008 | Attitude about time on the Internet |
| +0.008 | Communication by e-mail |
| +0.007 | Religion |
| +0.007 | Communication on the blog |
| +0.004 | Expert Advice |
| +0.004 | Health and medicine |
| +0.004 | Education |
| +0.004 | FB use – chatting |
| +0.003 | Sports |
| +0.002 | FB use - sharing music, photos etc. |
| +0.002 | Downloading music and movies |
| +0.001 | Sports – in minutes |
| +0.001 | FB use - reading posts |
| +0.001 | Anxiety temperament |
| +0.001 | Economic status |
| +0.000 | Politics |
| +0.000 | FB use –gaming |
| +0.000 | Science |
| -0.000 | Music |
| -0.000 | Business |
| -0.001 | Surfing |
| -0.001 | Internet for school |
| -0.001 | Reading and downloading books and texts |
| -0.001 | Travel/tourism |
| -0.001 | Online courses |
| -0.002 | Alcohol |
| -0.002 | Coffee |
| -0.003 | Communication on the forum |
| -0.004 | Pop culture |
| -0.005 | Targeted Internet search |
| -0.006 | Irritable temperament |
| -0.008 | Everyday FB use |
| -0.008 | Reading the news |
| -0.013 | Sports – intensity |
| -0.018 | Hyperthymic temperament |
| -0.022 | Sports – days in a week |
| -0.029 | Internet Use (in holiday) |
In [36]:
print('False positive predictions.\n')
i = predictions['fp'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
show_prediction(
predictions['model'],
x_test.iloc[i,:],
targets=[0, 1],
target_names=['PIU no', 'PIU yes'],
feature_names=x_test.columns.values.tolist(),
top=len(x_test.columns)+1
)
False positive predictions. Expected PIU value: 0.0. Predicted PIU value: 1.0.
Out[36]:
y=PIU yes (probability 0.654) top features
| Contribution? | Feature |
|---|---|
| +0.502 | <BIAS> |
| +0.098 | Cyclothymic temperament |
| +0.079 | Average time spent on FB |
| +0.069 | Internet Use (hours per week) |
| +0.050 | Irritable temperament |
| +0.021 | Online games |
| +0.019 | Arts and culture |
| +0.017 | Travel/tourism |
| +0.016 | Achievement |
| +0.014 | Targeted Internet search |
| +0.011 | Hyperthymic temperament |
| +0.011 | Social networks |
| +0.009 | Science |
| +0.008 | Internet for school |
| +0.006 | FB use - sharing music, photos etc. |
| +0.006 | FB use – chatting |
| +0.006 | Everyday FB use |
| +0.004 | Communication by e-mail |
| +0.004 | Education |
| +0.003 | Anxiety temperament |
| +0.003 | Music |
| +0.002 | FB use - reading posts |
| +0.002 | Downloading music and movies |
| +0.001 | Communication on the blog |
| +0.001 | Fast Food |
| -0.000 | Internet Use (in years) |
| -0.001 | FB use –gaming |
| -0.001 | Business |
| -0.001 | Health and medicine |
| -0.002 | Drepressive temperament |
| -0.003 | Expert Advice |
| -0.004 | Politics |
| -0.004 | Online courses |
| -0.005 | Alcohol |
| -0.005 | Sports |
| -0.006 | Reading the news |
| -0.006 | FB use - publishing statuses |
| -0.006 | Computers and technology |
| -0.006 | Pop culture |
| -0.008 | Religion |
| -0.008 | Economic status |
| -0.008 | Communication on the forum |
| -0.009 | Pornography |
| -0.011 | Sports – in minutes |
| -0.012 | Internet Use (in holiday) |
| -0.013 | Search for favorite websites |
| -0.015 | Sports – days in a week |
| -0.019 | Internet Use (hours per day) |
| -0.021 | Attitude about time on the Internet |
| -0.021 | Smoker |
| -0.021 | Energy drinks |
| -0.024 | Sports – intensity |
| -0.026 | Reading and downloading books and texts |
| -0.039 | Surfing |
In [37]:
print('False negative predictions.\n')
i = predictions['fn'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
show_prediction(
predictions['model'],
x_test.iloc[i,:],
targets=[0, 1],
target_names=['PIU no', 'PIU yes'],
feature_names=x_test.columns.values.tolist(),
top=len(x_test.columns)+1
)
False negative predictions. Expected PIU value: 1.0. Predicted PIU value: 0.0.
Out[37]:
y=PIU no (probability 0.562) top features
| Contribution? | Feature |
|---|---|
| +0.498 | <BIAS> |
| +0.073 | Internet Use (hours per week) |
| +0.053 | Social networks |
| +0.022 | Internet Use (hours per day) |
| +0.020 | Attitude about time on the Internet |
| +0.018 | Online games |
| +0.016 | Music |
| +0.015 | Communication on the blog |
| +0.012 | Computers and technology |
| +0.012 | Reading the news |
| +0.008 | Irritable temperament |
| +0.008 | Hyperthymic temperament |
| +0.007 | FB use - sharing music, photos etc. |
| +0.007 | Online courses |
| +0.007 | Reading and downloading books and texts |
| +0.007 | Anxiety temperament |
| +0.006 | FB use - reading posts |
| +0.006 | Internet Use (in holiday) |
| +0.005 | Communication by e-mail |
| +0.005 | Science |
| +0.005 | Search for favorite websites |
| +0.004 | Internet Use (in years) |
| +0.003 | FB use – visiting groups |
| +0.003 | Pop culture |
| +0.003 | Politics |
| +0.001 | Travel/tourism |
| +0.001 | Business |
| +0.000 | Religion |
| +0.000 | Sports – in minutes |
| -0.000 | Internet for school |
| -0.000 | Surfing |
| -0.000 | Drepressive temperament |
| -0.001 | FB use - publishing statuses |
| -0.002 | Expert Advice |
| -0.002 | Targeted Internet search |
| -0.003 | Fast Food |
| -0.003 | Sports – intensity |
| -0.004 | Gender |
| -0.004 | Economic status |
| -0.004 | FB use – chatting |
| -0.004 | Coffee |
| -0.004 | Downloading music and movies |
| -0.005 | Alcohol |
| -0.008 | Education |
| -0.012 | Communication on the forum |
| -0.013 | Sports – days in a week |
| -0.015 | Energy drinks |
| -0.016 | Pornography |
| -0.016 | Health and medicine |
| -0.017 | Everyday FB use |
| -0.017 | Achievement |
| -0.031 | Arts and culture |
| -0.039 | Average time spent on FB |
| -0.042 | Cyclothymic temperament |
In [38]:
# show statistics for features in top_n features of different methods
df_top_features = df_top_features.sort_values(by=['top_count_xai'],ascending=False)
print(df_top_features)
attr_names top_count top_count_xai \
31 Internet Use (hours per week) 6 3
4 Attitude about time on the Internet 6 3
5 Average time spent on FB 6 3
53 Surfing 4 3
18 Everyday FB use 6 3
35 Irritable temperament 4 3
12 Cyclothymic temperament 6 3
30 Internet Use (hours per day) 4 2
48 Social networks 3 2
38 Online games 3 2
2 Anxiety temperament 3 1
3 Arts and culture 4 1
17 Energy drinks 1 1
39 Politics 0 0
40 Pop culture 0 0
36 Music 0 0
37 Online courses 0 0
42 Reading and downloading books and texts 0 0
34 Internet for school 1 0
41 Pornography 1 0
0 Achievement 2 0
43 Reading the news 0 0
44 Religion 0 0
33 Internet Use (in years) 0 0
46 Search for favorite websites 0 0
47 Smoker 3 0
49 Sports 1 0
50 Sports – days in a week 0 0
51 Sports – in minutes 0 0
52 Sports – intensity 0 0
54 Targeted Internet search 0 0
45 Science 0 0
28 Health and medicine 0 0
32 Internet Use (in holiday) 0 0
16 Education 0 0
6 Business 0 0
7 Coffee 0 0
8 Communication by e-mail 0 0
9 Communication on the blog 0 0
10 Communication on the forum 0 0
11 Computers and technology 1 0
13 Downloading music and movies 0 0
14 Drepressive temperament 0 0
15 Economic status 0 0
19 Expert Advice 0 0
29 Hyperthymic temperament 2 0
20 FB use - publishing statuses 0 0
21 FB use - reading posts 0 0
22 FB use - sharing music, photos etc. 0 0
23 FB use – chatting 0 0
24 FB use – visiting groups 0 0
25 FB use –gaming 0 0
26 Fast Food 0 0
27 Gender 0 0
1 Alcohol 0 0
55 Travel/tourism 1 0
top_count_selection
31 3
4 3
5 3
53 1
18 3
35 1
12 3
30 2
48 1
38 1
2 2
3 3
17 0
39 0
40 0
36 0
37 0
42 0
34 1
41 1
0 2
43 0
44 0
33 0
46 0
47 3
49 1
50 0
51 0
52 0
54 0
45 0
28 0
32 0
16 0
6 0
7 0
8 0
9 0
10 0
11 1
13 0
14 0
15 0
19 0
29 2
20 0
21 0
22 0
23 0
24 0
25 0
26 0
27 0
1 0
55 1
In [39]:
df_top_features = df_top_features.sort_values(by=['top_count'],ascending=False)
print(df_top_features)
attr_names top_count top_count_xai \
31 Internet Use (hours per week) 6 3
5 Average time spent on FB 6 3
18 Everyday FB use 6 3
12 Cyclothymic temperament 6 3
4 Attitude about time on the Internet 6 3
3 Arts and culture 4 1
53 Surfing 4 3
35 Irritable temperament 4 3
30 Internet Use (hours per day) 4 2
47 Smoker 3 0
38 Online games 3 2
2 Anxiety temperament 3 1
48 Social networks 3 2
29 Hyperthymic temperament 2 0
0 Achievement 2 0
11 Computers and technology 1 0
41 Pornography 1 0
49 Sports 1 0
55 Travel/tourism 1 0
34 Internet for school 1 0
17 Energy drinks 1 1
9 Communication on the blog 0 0
39 Politics 0 0
13 Downloading music and movies 0 0
14 Drepressive temperament 0 0
15 Economic status 0 0
19 Expert Advice 0 0
20 FB use - publishing statuses 0 0
21 FB use - reading posts 0 0
22 FB use - sharing music, photos etc. 0 0
23 FB use – chatting 0 0
24 FB use – visiting groups 0 0
25 FB use –gaming 0 0
26 Fast Food 0 0
27 Gender 0 0
1 Alcohol 0 0
10 Communication on the forum 0 0
8 Communication by e-mail 0 0
42 Reading and downloading books and texts 0 0
7 Coffee 0 0
43 Reading the news 0 0
44 Religion 0 0
33 Internet Use (in years) 0 0
46 Search for favorite websites 0 0
37 Online courses 0 0
36 Music 0 0
50 Sports – days in a week 0 0
40 Pop culture 0 0
52 Sports – intensity 0 0
54 Targeted Internet search 0 0
45 Science 0 0
28 Health and medicine 0 0
32 Internet Use (in holiday) 0 0
16 Education 0 0
6 Business 0 0
51 Sports – in minutes 0 0
top_count_selection
31 3
5 3
18 3
12 3
4 3
3 3
53 1
35 1
30 2
47 3
38 1
2 2
48 1
29 2
0 2
11 1
41 1
49 1
55 1
34 1
17 0
9 0
39 0
13 0
14 0
15 0
19 0
20 0
21 0
22 0
23 0
24 0
25 0
26 0
27 0
1 0
10 0
8 0
42 0
7 0
43 0
44 0
33 0
46 0
37 0
36 0
50 0
40 0
52 0
54 0
45 0
28 0
32 0
16 0
6 0
51 0
In [41]:
df_top_features = df_top_features.sort_values(by=['top_count_selection'],ascending=False)
print(df_top_features)
attr_names top_count top_count_xai \
31 Internet Use (hours per week) 6 3
18 Everyday FB use 6 3
12 Cyclothymic temperament 6 3
4 Attitude about time on the Internet 6 3
3 Arts and culture 4 1
5 Average time spent on FB 6 3
47 Smoker 3 0
30 Internet Use (hours per day) 4 2
2 Anxiety temperament 3 1
29 Hyperthymic temperament 2 0
0 Achievement 2 0
11 Computers and technology 1 0
34 Internet for school 1 0
49 Sports 1 0
41 Pornography 1 0
55 Travel/tourism 1 0
48 Social networks 3 2
53 Surfing 4 3
35 Irritable temperament 4 3
38 Online games 3 2
43 Reading the news 0 0
44 Religion 0 0
33 Internet Use (in years) 0 0
46 Search for favorite websites 0 0
37 Online courses 0 0
36 Music 0 0
50 Sports – days in a week 0 0
52 Sports – intensity 0 0
40 Pop culture 0 0
54 Targeted Internet search 0 0
45 Science 0 0
28 Health and medicine 0 0
32 Internet Use (in holiday) 0 0
16 Education 0 0
6 Business 0 0
42 Reading and downloading books and texts 0 0
7 Coffee 0 0
21 FB use - reading posts 0 0
8 Communication by e-mail 0 0
10 Communication on the forum 0 0
17 Energy drinks 1 1
9 Communication on the blog 0 0
39 Politics 0 0
13 Downloading music and movies 0 0
14 Drepressive temperament 0 0
15 Economic status 0 0
19 Expert Advice 0 0
20 FB use - publishing statuses 0 0
22 FB use - sharing music, photos etc. 0 0
23 FB use – chatting 0 0
24 FB use – visiting groups 0 0
25 FB use –gaming 0 0
26 Fast Food 0 0
27 Gender 0 0
1 Alcohol 0 0
51 Sports – in minutes 0 0
top_count_selection
31 3
18 3
12 3
4 3
3 3
5 3
47 3
30 2
2 2
29 2
0 2
11 1
34 1
49 1
41 1
55 1
48 1
53 1
35 1
38 1
43 0
44 0
33 0
46 0
37 0
36 0
50 0
52 0
40 0
54 0
45 0
28 0
32 0
16 0
6 0
42 0
7 0
21 0
8 0
10 0
17 0
9 0
39 0
13 0
14 0
15 0
19 0
20 0
22 0
23 0
24 0
25 0
26 0
27 0
1 0
51 0
In [40]:
# calcualate statistics of important features for correctly classified instances
top_n_features = ['Internet Use (hours per week)',
'Average time spent on FB',
'Cyclothymic temperament',
'Attitude about time on the Internet',
'Irritable temperament',
'Surfing',
'Everyday FB use',
'Internet Use (hours per day)',
'Social networks',
'Depressive temperament'
]
relevance.get_important_features_statistics(predictions, x_test, top_n_features)
attr_names score
0 Internet Use (hours per week) 23
1 Average time spent on FB 22
2 Cyclothymic temperament 22
7 Internet Use (hours per day) 18
8 Social networks 16
4 Irritable temperament 10
5 Surfing 10
3 Attitude about time on the Internet 3
6 Everyday FB use 2
9 Depressive temperament 0
Number of tp instances 25.00. Mean number of relevant features per classification result 5.04.
attr_names score
0 Internet Use (hours per week) 34
7 Internet Use (hours per day) 33
1 Average time spent on FB 29
2 Cyclothymic temperament 25
4 Irritable temperament 24
8 Social networks 24
3 Attitude about time on the Internet 21
5 Surfing 19
6 Everyday FB use 12
9 Depressive temperament 0
Number of tn instances 46.00. Mean number of relevant features per classification result 4.80.
attr_names score
1 Average time spent on FB 8
0 Internet Use (hours per week) 7
2 Cyclothymic temperament 6
4 Irritable temperament 6
5 Surfing 6
8 Social networks 6
6 Everyday FB use 2
7 Internet Use (hours per day) 2
3 Attitude about time on the Internet 1
9 Depressive temperament 0
Number of fp instances 9.00. Mean number of relevant features per classification result 4.89.
attr_names score
0 Internet Use (hours per week) 13
7 Internet Use (hours per day) 10
1 Average time spent on FB 8
4 Irritable temperament 8
3 Attitude about time on the Internet 6
5 Surfing 6
8 Social networks 5
2 Cyclothymic temperament 4
6 Everyday FB use 3
9 Depressive temperament 0
Number of fn instances 16.00. Mean number of relevant features per classification result 3.94.